Libraries
library <- function(...) {suppressPackageStartupMessages(base::library(...))}
library(tidyverse)
library(dplyr)
library(knitr)
library(tidyr)
library(rmarkdown)
library(janitor)
library(scales)
library(tidytext)
library(ggforce)Dataset: US Real Estate Dataset
Your report must be of high quality, meaning that your report:
knitr::kable() printing.rmarkdown::paged_table()library <- function(...) {suppressPackageStartupMessages(base::library(...))}
library(tidyverse)
library(dplyr)
library(knitr)
library(tidyr)
library(rmarkdown)
library(janitor)
library(scales)
library(tidytext)
library(ggforce)In this data exploration we are looking at the US Real Estate market with use of a dataset from kaggle published by Ahmed Shahriar Sakib. It contains over 2.2 Million Real Estate listings broken down to State, Size, Price (among other factors). (Source: https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset/data)
Download and import your dataset. Assign each variable appropriate/correct data types. Discuss/explain your choice for each variable (groups) in the main text.
# Import Data
data = read.csv("data/realtor-data.zip.csv")# Keep relevant Columns
data = subset(data, select = c(status, price, bed, bath, acre_lot, city, state, house_size))str_out <- capture.output(str(data))
kable(data.frame(Output = str_out), col.names = "Structure Output")| Structure Output |
|---|
| ‘data.frame’: 2226382 obs. of 8 variables: |
| $ status : chr “for_sale” “for_sale” “for_sale” “for_sale” … |
| $ price : num 105000 80000 67000 145000 65000 179000 50000 71600 100000 300000 … |
| $ bed : int 3 4 2 4 6 4 3 3 2 5 … |
| $ bath : int 2 2 1 2 2 3 1 2 1 3 … |
| $ acre_lot : num 0.12 0.08 0.15 0.1 0.05 0.46 0.2 0.08 0.09 7.46 … |
| $ city : chr “Adjuntas” “Adjuntas” “Juana Diaz” “Ponce” … |
| $ state : chr “Puerto Rico” “Puerto Rico” “Puerto Rico” “Puerto Rico” … |
| $ house_size: num 920 1527 748 1800 NA … |
# Assign Data Types
data$status = as.factor(data$status)
data$city = as.factor(data$city)
data$state = as.factor(data$state)str_out <- capture.output(str(data))
kable(data.frame(Output = str_out), col.names = "Structure Output")| Structure Output |
|---|
| ‘data.frame’: 2226382 obs. of 8 variables: |
| $ status : Factor w/ 3 levels “for_sale”,“ready_to_build”,..: 1 1 1 1 1 1 1 1 1 1 … |
| $ price : num 105000 80000 67000 145000 65000 179000 50000 71600 100000 300000 … |
| $ bed : int 3 4 2 4 6 4 3 3 2 5 … |
| $ bath : int 2 2 1 2 2 3 1 2 1 3 … |
| $ acre_lot : num 0.12 0.08 0.15 0.1 0.05 0.46 0.2 0.08 0.09 7.46 … |
| $ city : Factor w/ 20099 levels ““,”100 89 Lower Shepard Creek Road”,..: 94 94 8790 14316 10938 15873 3276 14316 14316 9696 … |
| $ state : Factor w/ 56 levels ““,”Alabama”,“Alaska”,..: 43 43 43 43 43 43 43 43 43 43 … |
| $ house_size: num 920 1527 748 1800 NA … |
before_rows <- nrow(data)
data <- na.omit(data)
after_rows <- nrow(data)
kable(data.frame(Description = c("Before NA removal", "After NA removal"),
Rows = c(before_rows, after_rows)))| Description | Rows |
|---|---|
| Before NA removal | 2226382 |
| After NA removal | 1360716 |
The dataset now has 1360716 observations and 8 variables after removing rows with missing values.
data = data |>
mutate(price_per_sqm = price/house_size)
# Filter min and max values
data = data |>
filter(price > 10000 & price < 1000000000)paged_table(data)str(data)'data.frame': 1360076 obs. of 9 variables:
$ status : Factor w/ 3 levels "for_sale","ready_to_build",..: 1 1 1 1 1 1 1 1 1 1 ...
$ price : num 105000 80000 67000 145000 179000 50000 71600 100000 300000 89000 ...
$ bed : int 3 4 2 4 4 3 3 2 5 3 ...
$ bath : int 2 2 1 2 3 1 2 1 3 2 ...
$ acre_lot : num 0.12 0.08 0.15 0.1 0.46 ...
$ city : Factor w/ 20099 levels "","100 89 Lower Shepard Creek Road",..: 94 94 8790 14316 15873 3276 14316 14316 9696 8556 ...
$ state : Factor w/ 56 levels "","Alabama","Alaska",..: 43 43 43 43 43 43 43 43 43 43 ...
$ house_size : num 920 1527 748 1800 2520 ...
$ price_per_sqm: num 114.1 52.4 89.6 80.6 71 ...
- attr(*, "na.action")= 'omit' Named int [1:865666] 5 24 43 44 52 60 61 62 63 64 ...
..- attr(*, "names")= chr [1:865666] "5" "24" "43" "44" ...
We chose the following data types for our variables:
The following variables of the dataset will be looked into:
price - The price for which the item was listed on the market
status - The status if the house is already sold or still for sale
acre_lot - The size of the land / lot on which the house is located in acres
state - The state in which the house is located
house size - The size of the house in square feet
In this section we will cover the summary of our cleaned dataset. We will explore basic statistical values from our data.
data |>
janitor::clean_names() |>
mutate(row = row_number() |> factor()) |>
pivot_longer(cols = where(is.numeric)) |>
group_by(name) |>
summarize(N = n(),
min = min(value),
mean = mean(value),
median = median(value),
max = max(value),
st.dev = sd(value)
) |>
knitr::kable(digits = 2)| name | N | min | mean | median | max | st.dev |
|---|---|---|---|---|---|---|
| acre_lot | 1360076 | 0.00 | 12.75 | 0.21 | 100000.0 | 799.83 |
| bath | 1360076 | 1.00 | 2.54 | 2.00 | 222.0 | 1.36 |
| bed | 1360076 | 1.00 | 3.39 | 3.00 | 444.0 | 1.43 |
| house_size | 1360076 | 100.00 | 2119.39 | 1812.00 | 1560780.0 | 4069.66 |
| price | 1360076 | 10400.00 | 573730.62 | 379000.00 | 515000000.0 | 1226596.98 |
| price_per_sqm | 1360076 | 0.02 | 262.42 | 197.42 | 491412.2 | 500.86 |
data |>
clean_names() |>
pivot_longer(cols = where(is.numeric)) |>
ggplot(aes(x = value, fill = name)) +
geom_histogram(bins = 30, alpha = 0.7, color = "white") +
scale_x_log10(labels = label_comma()) + # 👈 echte Werte, log-Skala
facet_wrap(~ name, scales = "free_x") +
theme_minimal() +
labs(
title = "Distribution of Numerical Variables (logarithmic scale)",
x = "Value",
y = "Count"
) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 25, hjust = 1)
)top_n_per_var <- 10
nominal_summary <- data |>
clean_names() |>
select(where(is.factor), price) |>
pivot_longer(cols = where(is.factor),
names_to = "Variable",
values_to = "Category") |>
group_by(Variable, Category) |>
summarise(
Count = n(),
Percent = round(100 * Count / nrow(data), 2),
Mean_Price = round(mean(price, na.rm = TRUE), 0),
.groups = "drop"
) |>
group_by(Variable) |>
slice_max(order_by = Count, n = top_n_per_var, with_ties = FALSE) |>
ungroup()
kable(
nominal_summary,
caption = paste0(
"Top ", top_n_per_var,
" categories per factor variable (counts, share %, and mean price)"
),
digits = 2,
align = c("l", "l", "r", "r", "r")
)| Variable | Category | Count | Percent | Mean_Price |
|---|---|---|---|---|
| city | Houston | 19226 | 1.41 | 477651 |
| city | Tucson | 7876 | 0.58 | 384816 |
| city | Phoenix | 7694 | 0.57 | 543665 |
| city | Los Angeles | 7556 | 0.56 | 1885626 |
| city | Dallas | 7510 | 0.55 | 587276 |
| city | Philadelphia | 7336 | 0.54 | 338467 |
| city | Richmond | 6592 | 0.48 | 392538 |
| city | Orlando | 6281 | 0.46 | 418841 |
| city | Fort Worth | 6171 | 0.45 | 389780 |
| city | Saint Louis | 5970 | 0.44 | 250029 |
| state | California | 170954 | 12.57 | 1095518 |
| state | Texas | 145394 | 10.69 | 451253 |
| state | Florida | 127675 | 9.39 | 649826 |
| state | Arizona | 54488 | 4.01 | 552916 |
| state | Pennsylvania | 51922 | 3.82 | 343792 |
| state | New York | 50935 | 3.75 | 669257 |
| state | Georgia | 49234 | 3.62 | 422988 |
| state | Illinois | 46901 | 3.45 | 357316 |
| state | Washington | 46450 | 3.42 | 728113 |
| state | Virginia | 44236 | 3.25 | 547994 |
| status | for_sale | 750493 | 55.18 | 621383 |
| status | sold | 609583 | 44.82 | 515063 |
nominal_summary <- nominal_summary |>
group_by(Variable) |>
mutate(Category = forcats::fct_reorder(Category, Count),
Category = factor(Category, levels = unique(Category))) |>
ungroup()
# 🔹 Plot: Facets untereinander, mit eigener x-Skala und y-Skala pro Variable
ggplot(nominal_summary, aes(x = Count, y = Category, fill = Variable)) +
geom_col(show.legend = FALSE, alpha = 0.8, width = 0.7) +
facet_wrap(~ Variable, ncol = 1, scales = "free", drop = TRUE) +
scale_x_continuous(labels = label_comma()) + # 👈 Tausendertrennung, keine 1e+05
theme_minimal() +
labs(
title = "Top Categories per Factor Variable",
x = "Count",
y = "Category"
) +
theme(
panel.spacing.y = unit(1, "lines"),
strip.text = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 8),
plot.margin = margin(5, 15, 5, 5)
)library(dplyr)
library(tibble)
valid_states <- tibble(
state_name = c(state.name, "District of Columbia"),
state_abbr = c(state.abb, "DC")
)library(dplyr)
library(plotly)
map_price <- data |>
group_by(state) %>%
summarise(avg_price = mean(price, na.rm = TRUE), .groups = "drop") |>
inner_join(valid_states, by = c("state" = "state_name")) |>
mutate(avg_price_k = avg_price / 1000)
plot_ly(
map_price,
type = "choropleth",
locationmode = "USA-states",
locations = ~state_abbr,
z = ~avg_price_k,
text = ~paste0(state, "<br>Avg Price: $", round(avg_price_k, 1), "K"),
colorscale = "Blues",
colorbar = list(title = "Avg Price ($K)")
) |>
plotly::layout(
title = list(text = "Average Property Price by U.S. State"),
geo = list(scope = "usa", projection = list(type = "albers usa"))
)map_size <- data %>%
group_by(state) %>%
summarise(avg_size = mean(house_size, na.rm = TRUE), .groups = "drop") %>%
inner_join(valid_states, by = c("state" = "state_name"))
plot_ly(
map_size,
type = "choropleth",
locationmode = "USA-states",
locations = ~state_abbr,
z = ~avg_size,
text = ~paste0(state, "<br>Avg Size: ", round(avg_size), " sqft"),
colorscale = "Greens",
colorbar = list(title = "Avg Size (sqft)")
) %>%
plotly::layout(
title = list(text = "Average House Size by U.S. State"),
geo = list(scope = "usa", projection = list(type = "albers usa"))
)map_extremes <- data %>%
group_by(state) %>%
summarise(
min_price = suppressWarnings(min(price, na.rm = TRUE)),
max_price = suppressWarnings(max(price, na.rm = TRUE)),
.groups = "drop"
) %>%
mutate(range_price = max_price - min_price) %>%
inner_join(valid_states, by = c("state" = "state_name"))
plot_ly(
map_extremes,
type = "choropleth",
locationmode = "USA-states",
locations = ~state_abbr,
z = ~range_price,
text = ~paste0(
state,
"<br>Min: $", formatC(min_price, big.mark = ",", format = "f", digits = 0),
"<br>Max: $", formatC(max_price, big.mark = ",", format = "f", digits = 0)
),
colorscale = "Reds",
colorbar = list(title = "Price Range ($)")
) %>%
plotly::layout(
title = list(text = "Price Extrem Values by U.S. State (Max − Min)"),
geo = list(scope = "usa", projection = list(type = "albers usa"))
)Summarise your finding.